started: AL13Jul2019
last updated: AL01ct2019

Summary

Eigenvectors calculated from 622 variants (non-rare, not in LD)
for 3,019 samples = 2,504 kgen + 515 wecare-nfe (258UBC, 257CBC)

Suggest 34 clear ethnic outliers using manually selected thresholds
(previously, using a smaller number of variants in ampliseq-nfe analysis,
there was 14 outliers more; it is possible to strignten thretholds though …)

Start_section

Sys.time()
## [1] "2019-10-01 20:10:19 BST"
rm(list=ls())
graphics.off()

library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s12_joined_ampliseq_1kgp_PCA/s06_explore_joined_PCA_plots"

opts_knit$set(root.dir = base_folder)

options(stringsAsFactors = F)
options(warnPartialMatchArgs = T, 
        warnPartialMatchAttr = T, 
        warnPartialMatchDollar = T)

#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588 

Read_data

# Sequencing data
source_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s11_remove_BRCA_PALB_carriers"
load(paste(source_folder, "s01_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s12_joined_ampliseq_1kgp_PCA/s06_explore_joined_PCA_plots"

# Eigenvectors and eigenvalues
source_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s12_joined_ampliseq_1kgp_PCA/s05_calculate_joined_1kgp_ampliseq_PCs/data/s04_pca"
eigenvectors_file <- paste(source_folder, "ampliseq_1kg_622_3019_100PCs.eigenvec", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
eigenvalues_file <- paste(source_folder, "ampliseq_1kg_622_3019_100PCs.eigenval", sep="/")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")

# IKGP phenotypes
source_folder="/Users/alexey/Documents/resources/1kgp"
kg_phenotypes_file <- paste(source_folder, "integrated_call_samples_v3.20130502.ALL.panel", sep="/")
kg_phenotypes.df <- read.table(kg_phenotypes_file, header=T)

# Thresholds to select outliers
# (manually selected on the basis of visual assessment of plots)

pc1_th <- 0.006
pc2_th <- 0.009

# Clean-up
rm(source_folder, eigenvectors_file, eigenvalues_file, kg_phenotypes_file, genotypes.mx, variants.df)

Check data

ls()
## [1] "base_folder"      "eigenvalues.df"   "eigenvectors.df"  "kg_phenotypes.df" "pc1_th"           "pc2_th"           "phenotypes.df"
dim(eigenvectors.df)
## [1] 3019  102
dim(eigenvalues.df)
## [1] 100   1
dim(kg_phenotypes.df)
## [1] 2504    4
dim(phenotypes.df)
## [1] 515  24
table(phenotypes.df$cc)
## 
##   0   1 
## 258 257

Update eigenvectors

eigenvectors.df[1:5,1:5]
##       FID     IID        PC1         PC2         PC3
## 1 HG00096 HG00096 -0.0123803 -0.02318330 -0.01229230
## 2 HG00097 HG00097 -0.0116563 -0.01401060  0.00149094
## 3 HG00099 HG00099 -0.0101549 -0.00926998 -0.00632705
## 4 HG00100 HG00100 -0.0105016 -0.02286040 -0.00877107
## 5 HG00101 HG00101 -0.0113586 -0.01951960  0.00280684
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
"sample" -> colnames(eigenvectors.df)[1]
eigenvectors.df[1:5,1:5]
##          sample        PC1         PC2         PC3         PC4
## HG00096 HG00096 -0.0123803 -0.02318330 -0.01229230  0.01548540
## HG00097 HG00097 -0.0116563 -0.01401060  0.00149094  0.01210660
## HG00099 HG00099 -0.0101549 -0.00926998 -0.00632705 -0.01006840
## HG00100 HG00100 -0.0105016 -0.02286040 -0.00877107  0.00864473
## HG00101 HG00101 -0.0113586 -0.01951960  0.00280684  0.01152710

Look at eigenvalues

plot(eigenvalues.df$V1, type="b", ylab="Variance",
     main="Top 100 eigenvectors")

plot(eigenvalues.df$V1[1:10], type="b", ylab="Variance",
     main="Top 10 eigenvectors")

rm(eigenvalues.df)

Prepare data for Ampliseq-1kgp PCA plot

Expected order of samples:

# Check the expected order of samples
eigenvectors.df[c(2504,2505),c("sample","PC1")]
##                  sample         PC1
## NA21144         NA21144 -0.00761811
## 100_S8_L007 100_S8_L007 -0.01287310
# Prepare phenotypes for ampliseq-kgen data
phenotypes_ampliseq.df <- phenotypes.df[1:515,c("long_ids","cc")]
table(phenotypes_ampliseq.df$cc)
## 
##   0   1 
## 258 257
"UBC" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==0]
"CBC" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==1]
table(phenotypes_ampliseq.df$cc)
## 
## CBC UBC 
## 257 258
c("sample","group") -> colnames(phenotypes_ampliseq.df)

phenotypes_kgen.df <- kg_phenotypes.df[,c("sample","super_pop")]
c("sample","group") -> colnames(phenotypes_kgen.df)

phenotypes_ampliseq_kgen.df <- rbind(phenotypes_kgen.df,phenotypes_ampliseq.df)
table(phenotypes_ampliseq_kgen.df$group)
## 
## AFR AMR CBC EAS EUR SAS UBC 
## 661 347 257 504 503 489 258
# Add eigenvectors to phenotypes
eigenphen_ampliseq_kgen.df <- full_join(
  phenotypes_ampliseq_kgen.df, eigenvectors.df[,1:6], by="sample")
dim(eigenphen_ampliseq_kgen.df)
## [1] 3019    7
head(eigenphen_ampliseq_kgen.df)
##    sample group        PC1         PC2          PC3         PC4         PC5
## 1 HG00096   EUR -0.0123803 -0.02318330 -0.012292300  0.01548540  0.01744860
## 2 HG00097   EUR -0.0116563 -0.01401060  0.001490940  0.01210660  0.02883240
## 3 HG00099   EUR -0.0101549 -0.00926998 -0.006327050 -0.01006840  0.02283990
## 4 HG00100   EUR -0.0105016 -0.02286040 -0.008771070  0.00864473  0.00241337
## 5 HG00101   EUR -0.0113586 -0.01951960  0.002806840  0.01152710 -0.00587478
## 6 HG00102   EUR -0.0104850 -0.01412700  0.000701292  0.01694920  0.00885609
tail(eigenphen_ampliseq_kgen.df)
##            sample group         PC1        PC2         PC3        PC4          PC5
## 3014 95_S517_L008   CBC -0.01218660 -0.0047522 -0.00531965 0.01171110  0.000949141
## 3015 96_S236_L007   CBC -0.00991351 -0.0131943 -0.00128169 0.00433099  0.002379520
## 3016 97_S509_L008   UBC -0.01258500 -0.0143450 -0.01745760 0.00446267 -0.004143000
## 3017 98_S335_L008   UBC -0.01185330 -0.0136037 -0.00539617 0.00608450  0.008774560
## 3018 99_S418_L008   CBC -0.01344640 -0.0149487 -0.01010680 0.00732798 -0.004143990
## 3019  9_S346_L008   CBC -0.01171660 -0.0173992  0.01152830 0.02320550  0.016020500
# Clean-up
rm(kg_phenotypes.df, phenotypes_ampliseq.df, phenotypes_ampliseq_kgen.df, 
   phenotypes.df, phenotypes_kgen.df, eigenvectors.df)

Wecare PCA plot

# Prepare vector fr colour scale
myColours <- c("EUR"="BLUE", "AFR"="YELLOW", "AMR"="GREEN",
               "SAS"="GREY", "EAS"="AQUAMARINE", 
               "UBC"="SALMON", "CBC"="RED")

myColourScale <- scale_colour_manual(values=myColours)

# Static plot
ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=group)) +
  labs(title="WECARE Ampliseq PCA",
    subtitle="based on 622 non-rare variants not in LD", x="PC1", y="PC2") +
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale

# Interactive plot
plotly_group <- factor(eigenphen_ampliseq_kgen.df$group,
  levels=c("AFR","AMR","EAS","SAS","EUR","UBC","CBC"))

g <- ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) + 
  geom_point(aes(col=plotly_group, text=sample)) +
  labs(title="WECARE Ampliseq PCA", x="PC1", y="PC2") +
  theme(legend.title=element_blank()) + # To suppress the legend title, otherwise it would be "plotly_group
  geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
  geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
  myColourScale  
## Warning: Ignoring unknown aesthetics: text
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(myColours, myColourScale, plotly_group, g)

Make the list of ampliseq outliers

# Select ampliseq IDs
eigenphen_ampliseq_kgen.df$sample[2504:2505]
## [1] "NA21144"     "100_S8_L007"
ampliseq_ids <- eigenphen_ampliseq_kgen.df$sample[2505:3019]

# Select outliers
pc1_outliers <- eigenphen_ampliseq_kgen.df$PC1 > pc1_th & eigenphen_ampliseq_kgen.df$sample %in% ampliseq_ids
pc2_outliers <- eigenphen_ampliseq_kgen.df$PC2 > pc2_th & eigenphen_ampliseq_kgen.df$sample %in% ampliseq_ids
pca_outliers <- pc1_outliers | pc2_outliers

sum(pc1_outliers)
## [1] 20
sum(pc2_outliers)
## [1] 14
sum(pca_outliers)
## [1] 34
joined_outliers <- eigenphen_ampliseq_kgen.df[pca_outliers,"sample"]
joined_outliers
##  [1] "133_S168_L007" "141_S158_L007" "148_S432_L008" "16_S109_L007"  "235_S535_L008" "238_S520_L008" "246_S375_L008" "256_S513_L008" "267_S48_L007"  "273_S150_L007" "275_S22_L007"  "277_S292_L008" "285_S374_L008" "289_S69_L007"  "293_S479_L008" "308_S434_L008" "313_S362_L008" "323_S469_L008" "326_S317_L008" "329_S373_L008" "330_S409_L008" "347_S36_L007"  "352_S435_L008" "355_S365_L008" "366_S293_L008" "368_S46_L007"  "369_S230_L007" "372_S340_L008" "375_S140_L007" "385_S305_L008" "388_S259_L007" "3_S360_L008"   "403_S210_L007" "408_S130_L007"
# Compare with previous list of 48 outliers
previous_outliers <- c("103_S147_L007","11_S358_L008","133_S168_L007","139_S123_L007","141_S158_L007","146_S408_L008","148_S432_L008","16_S109_L007","1_S441_L008","213_S257_L007","235_S535_L008","238_S520_L008","246_S375_L008","248_S188_L007","256_S513_L008","267_S48_L007","273_S150_L007","275_S22_L007","276_S209_L007","277_S292_L008","285_S374_L008","287_S10_L007","289_S69_L007","293_S479_L008","308_S434_L008","313_S362_L008","323_S469_L008","326_S317_L008","329_S373_L008","330_S409_L008","347_S36_L007","352_S435_L008","355_S365_L008","35_S9_L007","366_S293_L008","368_S46_L007","369_S230_L007","372_S340_L008","375_S140_L007","376_S84_L007","381_S277_L008","385_S305_L008","388_S259_L007","3_S360_L008","403_S210_L007","407_S106_L007","408_S130_L007","94_S143_L007")

x <- setdiff(previous_outliers,joined_outliers)
length(x)
## [1] 14
x
##  [1] "103_S147_L007" "11_S358_L008"  "139_S123_L007" "146_S408_L008" "1_S441_L008"   "213_S257_L007" "248_S188_L007" "276_S209_L007" "287_S10_L007"  "35_S9_L007"    "376_S84_L007"  "381_S277_L008" "407_S106_L007" "94_S143_L007"
y <- setdiff(joined_outliers,previous_outliers)
length(y)
## [1] 0
y
## character(0)
z <- intersect(previous_outliers,joined_outliers)
length(z)
## [1] 34
z
##  [1] "133_S168_L007" "141_S158_L007" "148_S432_L008" "16_S109_L007"  "235_S535_L008" "238_S520_L008" "246_S375_L008" "256_S513_L008" "267_S48_L007"  "273_S150_L007" "275_S22_L007"  "277_S292_L008" "285_S374_L008" "289_S69_L007"  "293_S479_L008" "308_S434_L008" "313_S362_L008" "323_S469_L008" "326_S317_L008" "329_S373_L008" "330_S409_L008" "347_S36_L007"  "352_S435_L008" "355_S365_L008" "366_S293_L008" "368_S46_L007"  "369_S230_L007" "372_S340_L008" "375_S140_L007" "385_S305_L008" "388_S259_L007" "3_S360_L008"   "403_S210_L007" "408_S130_L007"
# Clean-up
rm(ampliseq_ids, pc1_outliers, pc2_outliers, pca_outliers, previous_outliers, x,y,z)

Save data

save.image(paste(base_folder, "s01_joined_PCA_plots_622_3019_not_rare_not_in_LD.RData", sep="/"))

Final_section

ls()
## [1] "base_folder"                "eigenphen_ampliseq_kgen.df" "joined_outliers"            "pc1_th"                     "pc2_th"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.9.0  ggplot2_3.2.0 dplyr_0.8.1   knitr_1.23   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1        later_0.8.0       pillar_1.4.1      compiler_3.5.1    tools_3.5.1       digest_0.6.19     jsonlite_1.6      evaluate_0.14     tibble_2.1.3      gtable_0.3.0      viridisLite_0.3.0 pkgconfig_2.0.2   rlang_0.3.4       shiny_1.3.2       crosstalk_1.0.0   yaml_2.2.0        xfun_0.7          withr_2.1.2       stringr_1.4.0     httr_1.4.0        htmlwidgets_1.3   grid_3.5.1        tidyselect_0.2.5  glue_1.3.1        data.table_1.12.2 R6_2.4.0          rmarkdown_1.13    purrr_0.3.2       tidyr_0.8.3       magrittr_1.5      promises_1.0.1    scales_1.0.0      htmltools_0.3.6   assertthat_0.2.1  xtable_1.8-4      mime_0.7          colorspace_1.4-1  httpuv_1.5.1      labeling_0.3      stringi_1.4.3     lazyeval_0.2.2    munsell_0.5.0     crayon_1.3.4
Sys.time()
## [1] "2019-10-01 20:10:22 BST"